The data was downloaded from https://www.kaggle.com/datasets/uciml/mushroom-classification.
Given features of a mushroom, we want to classify it as poisonous or
edible.
# Session-wide settings used by the rest of the script.
options(digits = 2) # print numbers with 2 significant digits
options(scipen = 999) # large penalty so printing avoids scientific notation
options(warn = -1) # suppress warnings
# Start from an empty workspace: remove every object that ls() reports.
# NOTE(review): this also wipes whatever the caller had defined when the
# script is sourced into an existing session.
rm(list = ls())
library(magrittr)
# Load the mushroom data with every column read as character (so an ID such as
# "0234" would keep its leading zero), clean the column names, add a binary
# `target` column (1 when class == "e", i.e. edible, otherwise 0), drop
# `veil_type`, and convert the result to a plain data.frame.
M <- readr::read_csv(
  file = "/Users/thienpham/Data Mining/data/mushrooms.csv",
  # A column default instead of one "c" per column: the specification no
  # longer has to match the exact number of columns in the file.
  col_types = readr::cols(.default = readr::col_character()),
  name_repair = janitor::make_clean_names # cleans up column names if messy
) %>%
  dplyr::mutate(
    target = as.numeric(class == "e") # 1 = edible, 0 = anything else
  ) %>%
  # veil_type is dropped; the original author notes it is constant (the same
  # value in every row) and therefore carries no information.
  dplyr::select(-veil_type) %>%
  as.data.frame()
# Record the class of every column of M as a named character vector.
# vapply() guarantees exactly one string per column; class(x)[1L] keeps the
# first class when a column carries more than one, so the result never
# degrades into a list the way sapply(M, class) can.
v_class <- vapply(
  X = M,
  FUN = function(x) class(x)[1L],
  FUN.VALUE = character(1)
)
v_class
## class cap_shape cap_surface
## "character" "character" "character"
## cap_color bruises odor
## "character" "character" "character"
## gill_attachment gill_spacing gill_size
## "character" "character" "character"
## gill_color stalk_shape stalk_root
## "character" "character" "character"
## stalk_surface_above_ring stalk_surface_below_ring stalk_color_above_ring
## "character" "character" "character"
## stalk_color_below_ring veil_color ring_number
## "character" "character" "character"
## ring_type spore_print_color population
## "character" "character" "character"
## habitat target
## "character" "numeric"
# Names of the columns whose recorded class is "character".
v_character <- names(which(v_class == "character"))
v_character
## [1] "class" "cap_shape"
## [3] "cap_surface" "cap_color"
## [5] "bruises" "odor"
## [7] "gill_attachment" "gill_spacing"
## [9] "gill_size" "gill_color"
## [11] "stalk_shape" "stalk_root"
## [13] "stalk_surface_above_ring" "stalk_surface_below_ring"
## [15] "stalk_color_above_ring" "stalk_color_below_ring"
## [17] "veil_color" "ring_number"
## [19] "ring_type" "spore_print_color"
## [21] "population" "habitat"
# Keep only the predictors: every character column except the label `class`.
v_character <- setdiff(v_character, "class")
# Recode missing entries in each predictor: NA, the string "NA" and the empty
# string all become the explicit value "N/A".
for (j in v_character) {
  M[[j]][is.na(M[[j]]) | M[[j]] %in% c("NA", "")] <- "N/A"
}
We visualize our data to check how balanced the two classes (edible and poisonous) are.
library(ggplot2)
# Count the rows in each class and build a "count, percent%" label per class.
M_plot <- M %>%
  dplyr::count(class) %>%
  dplyr::mutate(
    percent = round(100 * n / sum(n)),
    label = paste0(n, ", ", percent, "%")
  ) %>%
  as.data.frame()
# Bar plot of the class counts, with each label placed at the middle of its bar.
ggplot(M_plot, aes(x = class, y = n, fill = class, label = label)) +
  geom_col() +
  geom_text(position = position_stack(vjust = 0.5)) +
  labs(
    title = "Bar plot of class",
    caption = "Data source: https://www.kaggle.com/datasets/uciml/mushroom-classification"
  )
We reorder the levels of each predictor by frequency. We wouldn't normally do this for ordinal data because it disrupts the natural order, but it makes sense for nominal data.
# Turn every predictor into a factor whose levels are ordered by how often
# each value occurs (forcats::fct_infreq).
for (j in v_character) {
  M[[j]] <- forcats::fct_infreq(M[[j]])
}
This uses the Chi-squared test to establish an order of our predictors from most informative to least informative.
# For each predictor, cross-tabulate it against `class` and run a chi-squared
# test on that table. The result is a named vector of p-values, one per
# predictor; sorting it in ascending order and keeping the names orders the
# predictors from smallest to largest p-value.
# vapply() (instead of sapply()) guarantees one numeric p-value per predictor.
v_character <- names(sort(vapply(
  X = v_character,
  FUN = function(j) chisq.test(
    x = table(M[, c("class", j)])
  )$p.value,
  FUN.VALUE = numeric(1)
)))
v_character
## [1] "bruises" "odor"
## [3] "gill_size" "gill_color"
## [5] "stalk_surface_above_ring" "stalk_surface_below_ring"
## [7] "stalk_color_above_ring" "stalk_color_below_ring"
## [9] "ring_type" "spore_print_color"
## [11] "population" "habitat"
## [13] "stalk_root" "gill_spacing"
## [15] "cap_shape" "ring_number"
## [17] "cap_color" "cap_surface"
## [19] "veil_color" "gill_attachment"
## [21] "stalk_shape"
# Example of the class-by-predictor contingency table that is passed to the
# chi-squared test, shown here for `bruises`.
table(M[c("class", "bruises")])
## bruises
## class f t
## e 1456 2752
## p 3292 624
These visualizations help us see how well each level of a predictor separates edible from poisonous mushrooms.
Dr. Smith suppressed a lot of warning messages from this chunk.
# Color for each percentage bin of the bubble plots. The names are the bin
# labels that cut() is told to produce below, so the two always agree.
v_color <- c(
  "(0,33]" = "red",
  "(33,67]" = "forestgreen",
  "(67,100]" = "skyblue"
)
# One bubble plot per predictor: x = predictor level, y = class,
# bubble size = row count, bubble color = binned share of the class within
# the predictor level.
for (j in v_character) {
  M_plot <- M %>%
    # Count rows for every observed (class, predictor level) combination.
    # `.data[[j]]` replaces the deprecated select_()/group_by_() verbs.
    dplyr::group_by(class, .data[[j]]) %>%
    dplyr::summarise(n = dplyr::n()) %>%
    # Regroup by predictor level only, so sum(n) is the total of that level.
    dplyr::group_by(.data[[j]]) %>%
    dplyr::mutate(
      percent = round(100 * n / sum(n)),
      label = paste0(n, "\n", percent, "%"),
      # include.lowest = TRUE keeps a share that rounds to 0 in the first bin
      # instead of turning it into NA; explicit labels keep the bin names
      # identical to names(v_color).
      percent = cut(
        percent,
        breaks = c(0, 33, 67, 100),
        labels = names(v_color),
        include.lowest = TRUE
      )
    )
  p <- ggplot(M_plot) +
    # `.data[[j]]` replaces the deprecated aes_string(x = j).
    aes(x = .data[[j]], y = class, label = label) +
    geom_point(aes(size = n, color = percent)) +
    geom_text() +
    scale_size_area(max_size = 20) +
    scale_color_manual(values = v_color) +
    theme_bw() +
    labs(
      x = j, # keep the predictor name as the axis title
      title = paste0("Bubble plot of class by ", j),
      subtitle = "Large red and blue bubbles are better for predictor variables.",
      caption = paste0(
        "Data source: https://www.kaggle.com/datasets/uciml/mushroom-classification",
        "\n",
        "Chi-squared test p-value: ",
        round(chisq.test(table(M[, c("class", j)]))$p.value, 22)
      )
    )
  plot(
    x = p
  )
}
########################################## !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# For the first plot (bruises), most mushrooms with bruises are edible and most
# mushrooms without bruises are poisonous, so bruises alone already gives a
# useful (though not perfect) classification. We also notice that as we
# progress down the predictor variables to higher chi-squared p-values (less
# significant), we see more bubbles in the green percentage range, indicating
# that the results are not as meaningful (we cannot classify right away without
# more extensive testing).
During feature selection we will use our association analysis to
identify which predictors to keep and which to drop.
* The bigger and darker the circle, the stronger the association.
# Matrix of pairwise Cramer's V values between the predictors:
# DescTools::PairApply() is given the predictor columns and
# DescTools::CramerV as the function to apply.
M_CramerV <- DescTools::PairApply(M[v_character], FUN = DescTools::CramerV)
# Plot the association matrix without its diagonal. is.corr = FALSE because the
# values are not correlations; variables are ordered by hierarchical
# clustering with the "ward.D" method.
corrplot::corrplot(
  M_CramerV,
  diag = FALSE,
  is.corr = FALSE,
  order = "hclust",
  hclust.method = "ward.D"
)
This will produce a hierarchical clustering based on the association between categorical variables. If two variables are highly associated, then keeping both will result in collinearity problems. The goal, for feature selection purposes, is to identify whether another predictor will pick up the slack if I remove a given predictor variable.
# Dendrogram of the predictors. 1 - Cramer's V is used as the distance, so
# strongly associated predictors end up close together; clusters are merged
# with the "ward.D" method.
# NOTE(review): the nested call is kept exactly as written; plot.hclust
# presumably derives its default axis text from the stored hclust() call, so
# restructuring it could change the figure - confirm before refactoring.
plot(
x = hclust(
d = as.dist(
m = 1 - M_CramerV
),
method = "ward.D" # author's note: Ward's D gives more even subgroups
)
)
# If we remove bruises, ring_type could still provide us with similar information. This reduces the number of predictor variables needed while still giving us as much information.
# Same dendrogram as for Ward's method (distance = 1 - Cramer's V), but
# clusters are merged with single linkage.
# NOTE(review): the nested call is kept exactly as written; plot.hclust
# presumably derives its default axis text from the stored hclust() call, so
# restructuring it could change the figure - confirm before refactoring.
plot(
x = hclust(
d = as.dist(
m = 1 - M_CramerV
),
method = "single" # author's note: single linkage is better at exposing outliers
)
)
# With single linkage we can see that cap_surface and cap_shape are very different from all of the other predictor variables therefore we would not want to remove these variables since they contain unique information.
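To make model fitting faster and to prevent over-fitting, categorical and ordinal columns will be binned into two levels and represented with a binary column. The binning strategy aims for two groups of roughly equal size (50/50).
Two binning strategies are used: a target-mean method (the variables in `v_mean`) and a most-frequent-level-versus-other method (the variables in `v_other`).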
After binning into 2 groups, the first group will be all 0’s in the binary predictor variable, and the second group all 1’s.
Which variables would use which method was decided beforehand. Normally you would apply one method to all predictor variables and, if there weren't any clear binning groups, run the other method on the remaining variables.
# Predictors binned with the target-mean method.
v_mean <- c(
  "cap_color",
  "bruises",
  "gill_color",
  "ring_type",
  "spore_print_color",
  "population",
  "cap_shape",
  "odor",
  "stalk_shape",
  "stalk_color_below_ring",
  "stalk_color_above_ring"
)
# Predictors binned with the "most frequent level vs. other" method.
v_other <- c(
  "veil_color",
  "gill_attachment",
  "ring_number",
  "gill_spacing",
  "habitat",
  "cap_surface",
  "gill_size",
  "stalk_root",
  "stalk_surface_above_ring",
  "stalk_surface_below_ring"
)
The general rule used for the “other” method is a split of roughly 1/3 and 2/3 if there is no clear distinction, e.g. with a=.39, b=.23, c=.19, d=.12, e=.07, bin a into one group and everything else into the other.
# "Other" binning: for each predictor in v_other, the most frequent level is
# recoded as 1 and every other level as 0. The original level counts and
# proportions are printed for reference.
for (j in v_other) {
  # Level counts, largest first.
  v_table <- sort(table(x = M[[j]]), decreasing = TRUE)
  # Binary recode against the first (most frequent) level.
  M[[j]] <- as.numeric(M[[j]] == names(v_table)[1])
  # Strip the "table" class before building the summary data.frame.
  v_table <- unclass(v_table)
  M_class <- data.frame(
    level = names(v_table),
    n = v_table,
    proportion = prop.table(v_table)
  )
  print("---------------------------------------------------------------------")
  print(j)
  print(M_class)
}
## [1] "---------------------------------------------------------------------"
## [1] "veil_color"
## level n proportion
## w w 7924 0.97538
## n n 96 0.01182
## o o 96 0.01182
## y y 8 0.00098
## [1] "---------------------------------------------------------------------"
## [1] "gill_attachment"
## level n proportion
## f f 7914 0.974
## a a 210 0.026
## [1] "---------------------------------------------------------------------"
## [1] "ring_number"
## level n proportion
## o o 7488 0.9217
## t t 600 0.0739
## n n 36 0.0044
## [1] "---------------------------------------------------------------------"
## [1] "gill_spacing"
## level n proportion
## c c 6812 0.84
## w w 1312 0.16
## [1] "---------------------------------------------------------------------"
## [1] "habitat"
## level n proportion
## d d 3148 0.387
## g g 2148 0.264
## p p 1144 0.141
## l l 832 0.102
## u u 368 0.045
## m m 292 0.036
## w w 192 0.024
## [1] "---------------------------------------------------------------------"
## [1] "cap_surface"
## level n proportion
## y y 3244 0.39931
## s s 2556 0.31462
## f f 2320 0.28557
## g g 4 0.00049
## [1] "---------------------------------------------------------------------"
## [1] "gill_size"
## level n proportion
## b b 5612 0.69
## n n 2512 0.31
## [1] "---------------------------------------------------------------------"
## [1] "stalk_root"
## level n proportion
## b b 3776 0.465
## ? ? 2480 0.305
## e e 1120 0.138
## c c 556 0.068
## r r 192 0.024
## [1] "---------------------------------------------------------------------"
## [1] "stalk_surface_above_ring"
## level n proportion
## s s 5176 0.637
## k k 2372 0.292
## f f 552 0.068
## y y 24 0.003
## [1] "---------------------------------------------------------------------"
## [1] "stalk_surface_below_ring"
## level n proportion
## s s 4936 0.608
## k k 2304 0.284
## f f 600 0.074
## y y 284 0.035
# Target-mean binning: for each predictor in v_mean, levels are ordered by
# their mean `target` (share of edible rows) and split into two groups around
# the cumulative 50% point. Levels in the upper group are recoded as 1, all
# others as 0. The per-level table is printed for reference.
for (j in v_mean) {
  M_target <- M %>%
    # `.data[[j]]` replaces the deprecated select_()/mutate_()/group_by_()
    # calls; the old mutate_() step only added a throwaway copy named `j`.
    dplyr::group_by(.data[[j]]) %>%
    dplyr::summarise(target = mean(target, na.rm = TRUE), n = dplyr::n()) %>%
    dplyr::ungroup() %>%
    dplyr::mutate(proportion = n / sum(n)) %>%
    # Cumulative share of rows up to and including each level, in order of
    # increasing target mean.
    dplyr::arrange(target) %>%
    dplyr::mutate(cumsum_ascending = cumsum(proportion)) %>%
    # Cumulative share of rows strictly before each level.
    dplyr::arrange(dplyr::desc(cumsum_ascending)) %>%
    dplyr::mutate(cumsum_descending = 1 - cumsum(proportion)) %>%
    dplyr::arrange(cumsum_ascending) %>%
    # Midpoint of the level's cumulative-share interval.
    dplyr::mutate(mean_cumsum = (cumsum_ascending + cumsum_descending) / 2) %>%
    dplyr::arrange(mean_cumsum) %>%
    as.data.frame()
  # Levels whose midpoint lies at or above 50%. They are taken from the rows of
  # M_target so they line up with mean_cumsum (indexing levels() with that mask
  # would pick the wrong levels).
  v_j <- as.character(M_target[M_target$mean_cumsum >= 0.5, j])
  M[[j]] <- as.numeric(M[[j]] %in% v_j)
  print("---------------------------------------------------------------------")
  print(j)
  print(knitr::kable(
    M_target
  ))
}
## [1] "---------------------------------------------------------------------"
## [1] "cap_color"
##
##
## |cap_color | target| n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:---------|------:|----:|----------:|----------------:|-----------------:|-----------:|
## |b | 0.29| 168| 0.02| 0.02| 0.00| 0.01|
## |y | 0.37| 1072| 0.13| 0.15| 0.02| 0.09|
## |p | 0.39| 144| 0.02| 0.17| 0.15| 0.16|
## |e | 0.42| 1500| 0.18| 0.35| 0.17| 0.26|
## |n | 0.55| 2284| 0.28| 0.64| 0.35| 0.50|
## |g | 0.56| 1840| 0.23| 0.86| 0.64| 0.75|
## |w | 0.69| 1040| 0.13| 0.99| 0.86| 0.93|
## |c | 0.73| 44| 0.01| 1.00| 0.99| 0.99|
## |r | 1.00| 16| 0.00| 1.00| 1.00| 1.00|
## |u | 1.00| 16| 0.00| 1.00| 1.00| 1.00|
## [1] "---------------------------------------------------------------------"
## [1] "bruises"
##
##
## |bruises | target| n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:-------|------:|----:|----------:|----------------:|-----------------:|-----------:|
## |f | 0.31| 4748| 0.58| 0.58| 0.00| 0.29|
## |t | 0.82| 3376| 0.42| 1.00| 0.58| 0.79|
## [1] "---------------------------------------------------------------------"
## [1] "gill_color"
##
##
## |gill_color | target| n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:----------|------:|----:|----------:|----------------:|-----------------:|-----------:|
## |b | 0.00| 1728| 0.21| 0.21| 0.00| 0.11|
## |r | 0.00| 24| 0.00| 0.22| 0.21| 0.21|
## |h | 0.28| 732| 0.09| 0.31| 0.22| 0.26|
## |g | 0.33| 752| 0.09| 0.40| 0.31| 0.35|
## |p | 0.57| 1492| 0.18| 0.58| 0.40| 0.49|
## |y | 0.74| 86| 0.01| 0.59| 0.58| 0.59|
## |w | 0.80| 1202| 0.15| 0.74| 0.59| 0.67|
## |k | 0.84| 408| 0.05| 0.79| 0.74| 0.77|
## |n | 0.89| 1048| 0.13| 0.92| 0.79| 0.86|
## |u | 0.90| 492| 0.06| 0.98| 0.92| 0.95|
## |e | 1.00| 96| 0.01| 0.99| 0.98| 0.99|
## |o | 1.00| 64| 0.01| 1.00| 0.99| 1.00|
## [1] "---------------------------------------------------------------------"
## [1] "ring_type"
##
##
## |ring_type | target| n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:---------|------:|----:|----------:|----------------:|-----------------:|-----------:|
## |l | 0.00| 1296| 0.16| 0.16| 0.00| 0.08|
## |n | 0.00| 36| 0.00| 0.16| 0.16| 0.16|
## |e | 0.36| 2776| 0.34| 0.51| 0.16| 0.33|
## |p | 0.79| 3968| 0.49| 0.99| 0.51| 0.75|
## |f | 1.00| 48| 0.01| 1.00| 0.99| 1.00|
## [1] "---------------------------------------------------------------------"
## [1] "spore_print_color"
##
##
## |spore_print_color | target| n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:-----------------|------:|----:|----------:|----------------:|-----------------:|-----------:|
## |r | 0.00| 72| 0.01| 0.01| 0.00| 0.00|
## |h | 0.03| 1632| 0.20| 0.21| 0.01| 0.11|
## |w | 0.24| 2388| 0.29| 0.50| 0.21| 0.36|
## |k | 0.88| 1872| 0.23| 0.73| 0.50| 0.62|
## |n | 0.89| 1968| 0.24| 0.98| 0.73| 0.86|
## |b | 1.00| 48| 0.01| 0.98| 0.98| 0.98|
## |o | 1.00| 48| 0.01| 0.99| 0.98| 0.99|
## |u | 1.00| 48| 0.01| 0.99| 0.99| 0.99|
## |y | 1.00| 48| 0.01| 1.00| 0.99| 1.00|
## [1] "---------------------------------------------------------------------"
## [1] "population"
##
##
## |population | target| n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:----------|------:|----:|----------:|----------------:|-----------------:|-----------:|
## |v | 0.30| 4040| 0.50| 0.50| 0.00| 0.25|
## |y | 0.62| 1712| 0.21| 0.71| 0.50| 0.60|
## |s | 0.71| 1248| 0.15| 0.86| 0.71| 0.78|
## |c | 0.85| 340| 0.04| 0.90| 0.86| 0.88|
## |n | 1.00| 400| 0.05| 0.95| 0.90| 0.93|
## |a | 1.00| 384| 0.05| 1.00| 0.95| 0.98|
## [1] "---------------------------------------------------------------------"
## [1] "cap_shape"
##
##
## |cap_shape | target| n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:---------|------:|----:|----------:|----------------:|-----------------:|-----------:|
## |c | 0.00| 4| 0.00| 0.00| 0.00| 0.00|
## |k | 0.28| 828| 0.10| 0.10| 0.00| 0.05|
## |f | 0.51| 3152| 0.39| 0.49| 0.10| 0.30|
## |x | 0.53| 3656| 0.45| 0.94| 0.49| 0.72|
## |b | 0.89| 452| 0.06| 1.00| 0.94| 0.97|
## |s | 1.00| 32| 0.00| 1.00| 1.00| 1.00|
## [1] "---------------------------------------------------------------------"
## [1] "odor"
##
##
## |odor | target| n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:----|------:|----:|----------:|----------------:|-----------------:|-----------:|
## |f | 0.00| 2160| 0.27| 0.27| 0.00| 0.13|
## |s | 0.00| 576| 0.07| 0.34| 0.27| 0.30|
## |y | 0.00| 576| 0.07| 0.41| 0.34| 0.37|
## |p | 0.00| 256| 0.03| 0.44| 0.41| 0.42|
## |c | 0.00| 192| 0.02| 0.46| 0.44| 0.45|
## |m | 0.00| 36| 0.00| 0.47| 0.46| 0.47|
## |n | 0.97| 3528| 0.43| 0.90| 0.47| 0.68|
## |a | 1.00| 400| 0.05| 0.95| 0.90| 0.93|
## |l | 1.00| 400| 0.05| 1.00| 0.95| 0.98|
## [1] "---------------------------------------------------------------------"
## [1] "stalk_shape"
##
##
## |stalk_shape | target| n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:-----------|------:|----:|----------:|----------------:|-----------------:|-----------:|
## |e | 0.46| 3516| 0.43| 0.43| 0.00| 0.22|
## |t | 0.56| 4608| 0.57| 1.00| 0.43| 0.72|
## [1] "---------------------------------------------------------------------"
## [1] "stalk_color_below_ring"
##
##
## |stalk_color_below_ring | target| n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:----------------------|------:|----:|----------:|----------------:|-----------------:|-----------:|
## |b | 0.00| 432| 0.05| 0.05| 0.00| 0.03|
## |c | 0.00| 36| 0.00| 0.06| 0.05| 0.06|
## |y | 0.00| 24| 0.00| 0.06| 0.06| 0.06|
## |n | 0.12| 512| 0.06| 0.12| 0.06| 0.09|
## |p | 0.31| 1872| 0.23| 0.35| 0.12| 0.24|
## |w | 0.62| 4384| 0.54| 0.89| 0.35| 0.62|
## |g | 1.00| 576| 0.07| 0.96| 0.89| 0.93|
## |o | 1.00| 192| 0.02| 0.99| 0.96| 0.98|
## |e | 1.00| 96| 0.01| 1.00| 0.99| 0.99|
## [1] "---------------------------------------------------------------------"
## [1] "stalk_color_above_ring"
##
##
## |stalk_color_above_ring | target| n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:----------------------|------:|----:|----------:|----------------:|-----------------:|-----------:|
## |b | 0.00| 432| 0.05| 0.05| 0.00| 0.03|
## |c | 0.00| 36| 0.00| 0.06| 0.05| 0.06|
## |y | 0.00| 8| 0.00| 0.06| 0.06| 0.06|
## |n | 0.04| 448| 0.06| 0.11| 0.06| 0.09|
## |p | 0.31| 1872| 0.23| 0.34| 0.11| 0.23|
## |w | 0.62| 4464| 0.55| 0.89| 0.34| 0.62|
## |g | 1.00| 576| 0.07| 0.96| 0.89| 0.93|
## |o | 1.00| 192| 0.02| 0.99| 0.96| 0.98|
## |e | 1.00| 96| 0.01| 1.00| 0.99| 0.99|
# Per-column summary of the prepared data.
summary(M)
## class cap_shape cap_surface cap_color bruises
## Length:8124 Min. :0.00 Min. :0.0 Min. :0.00 Min. :0.00
## Class :character 1st Qu.:0.00 1st Qu.:0.0 1st Qu.:0.00 1st Qu.:0.00
## Mode :character Median :1.00 Median :0.0 Median :0.00 Median :0.00
## Mean :0.51 Mean :0.4 Mean :0.36 Mean :0.42
## 3rd Qu.:1.00 3rd Qu.:1.0 3rd Qu.:1.00 3rd Qu.:1.00
## Max. :1.00 Max. :1.0 Max. :1.00 Max. :1.00
## odor gill_attachment gill_spacing gill_size gill_color
## Min. :0.00 Min. :0.00 Min. :0.00 Min. :0.00 Min. :0.00
## 1st Qu.:0.00 1st Qu.:1.00 1st Qu.:1.00 1st Qu.:0.00 1st Qu.:0.00
## Median :1.00 Median :1.00 Median :1.00 Median :1.00 Median :0.00
## Mean :0.53 Mean :0.97 Mean :0.84 Mean :0.69 Mean :0.42
## 3rd Qu.:1.00 3rd Qu.:1.00 3rd Qu.:1.00 3rd Qu.:1.00 3rd Qu.:1.00
## Max. :1.00 Max. :1.00 Max. :1.00 Max. :1.00 Max. :1.00
## stalk_shape stalk_root stalk_surface_above_ring
## Min. :0.00 Min. :0.00 Min. :0.00
## 1st Qu.:0.00 1st Qu.:0.00 1st Qu.:0.00
## Median :1.00 Median :0.00 Median :1.00
## Mean :0.57 Mean :0.46 Mean :0.64
## 3rd Qu.:1.00 3rd Qu.:1.00 3rd Qu.:1.00
## Max. :1.00 Max. :1.00 Max. :1.00
## stalk_surface_below_ring stalk_color_above_ring stalk_color_below_ring
## Min. :0.00 Min. :0.00 Min. :0.00
## 1st Qu.:0.00 1st Qu.:0.00 1st Qu.:0.00
## Median :1.00 Median :1.00 Median :1.00
## Mean :0.61 Mean :0.66 Mean :0.65
## 3rd Qu.:1.00 3rd Qu.:1.00 3rd Qu.:1.00
## Max. :1.00 Max. :1.00 Max. :1.00
## veil_color ring_number ring_type spore_print_color population
## Min. :0.00 Min. :0.00 Min. :0.00 Min. :0.0 Min. :0.0
## 1st Qu.:1.00 1st Qu.:1.00 1st Qu.:0.00 1st Qu.:0.0 1st Qu.:0.0
## Median :1.00 Median :1.00 Median :0.00 Median :0.0 Median :1.0
## Mean :0.98 Mean :0.92 Mean :0.49 Mean :0.5 Mean :0.5
## 3rd Qu.:1.00 3rd Qu.:1.00 3rd Qu.:1.00 3rd Qu.:1.0 3rd Qu.:1.0
## Max. :1.00 Max. :1.00 Max. :1.00 Max. :1.0 Max. :1.0
## habitat target
## Min. :0.00 Min. :0.00
## 1st Qu.:0.00 1st Qu.:0.00
## Median :0.00 Median :1.00
## Mean :0.39 Mean :0.52
## 3rd Qu.:1.00 3rd Qu.:1.00
## Max. :1.00 Max. :1.00
Now the next step is feature selection and then model building. # Save prepared data We want to see the minimum = 0, the maximum = 1, and the mean between 1/3 and 2/3. But if the mean is outside of that range then it's OK, because there's only so much you can do once you've done everything.
# Save the prepared data as a CSV file without a row-name column.
write.csv(M, file = "/Users/thienpham/Data Mining/data/prepared_mushrooms.csv", row.names = FALSE)
Take your data for classification supervised learning and prepare your predictors and your target variable. You do not need to perform feature selection yet, we will do that in an upcoming assignment.